---
title: "CE Index Measurement"
output: html_document
date: "2025-02-07"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


```{r}
rm(list = ls())
cat("\014")
```

```{r}
library(tidyverse)
library(vroom)
library(ggrepel)
library(ggthemes)
library(ggplot2)
library(dplyr)
library(forcats)
library(zoo)
library(MCMCpack)
```

*Load Data*
```{r}
# Population numbers
# Source: 
pop <- vroom("data/pop.in.th.csv")

state_data <- expand.grid(
  year = 1990:2023,
  state = unique(pop$State)
)

# Energy consumption 
# SourcE: US EIA
energy <- read.csv("data/energy.cons.per.2017.gdp.csv")

# Renewable Energy
# Source: EIA
renew <- vroom("data/energy.prod.cons.csv")

renew <- left_join(renew, pop, by = c("State", "year"))

renew$renew.prod.per.capita <- renew$tot.renew.bn.btu.prod / (renew$pop.in.th)
# Join
state_data <- left_join(state_data, renew, by = c("state" = "State", "year"))


# Innovation / Patents in CPC Y02
# Source: USPTO
pats <- vroom("data/y02.patents.csv")

pop <- pop %>%
  rename(state = State)

############

pats <- pats[, -1]

pats <- pats %>%
  rename(clean.pats = y02.pats)

pats <- left_join(pats, pop, by = c("state", "year"))

pats <- pats %>%
  mutate(pats.per.1k = clean.pats / pop.in.th)
state_data <- left_join(state_data, pats, by = c("state", "year"))

## 

# Annual new companies in clean tech - Crunchbase - per capita 
dat <- vroom("data/firms.csv")

fips <- vroom("data/FIPS_CODEs.csv") %>%
  dplyr::select(statename, stateabbr)

dat <- left_join(dat, fips) %>%
  distinct(stateabbr, year, .keep_all = T) %>%
  rename()

#firms per capita 
#scale 

dat <- left_join(dat, pop, by = c("stateabbr" = "state", "year"))

dat$ce.firms.cr.per.1k <- dat$n / dat$pop.in.th

dat <- dat %>%
  dplyr::select(stateabbr, year, ce.firms.cr.per.1k)

state_data <- left_join(state_data, dat, by = c("state" = "stateabbr", "year"))


#Energy consumption intensity 
en <- vroom("data/energy.cons.per.2017.usd.gdp.csv")
state_data <- left_join(state_data, en, by = c("state" = "State", "year"))

#Carbon intensity 
carbon <- vroom("data/carbon.intensity.csv")
carbon <- carbon %>%
  pivot_longer(cols = 2:27,
               names_to = "year",
               values_to = "carbon.per") %>%
  filter(year > 1997) %>%
  filter(!State %in% c("DC", "US"))

carbon$year <- as.numeric(carbon$year)

state_data <- left_join(state_data, carbon, by = c("state" = "State", "year"))

#Vehicle use 
cars <- vroom("data/car.use.pc.csv")
state_data <- left_join(state_data, cars, by = c("state" = "State", "year"))

# plug ins per capita 

plug <- vroom("data/plug_in_stock.csv")
plug <- plug %>%
  dplyr::select(1:8) %>%
  pivot_longer(2:8, names_to = "year", values_to = "plug.in.stock") %>%
  mutate(year = as.numeric(year)) %>%
  left_join(pop, by = c("State" = "state", "year")) %>%
  mutate(plugins.per = plug.in.stock / pop.in.th) %>%
  dplyr::select(State, year, plugins.per)

state_data <- left_join(state_data, plug, by = c("state" = "State", "year"))


#additional emissions and sinks data 
epa <- vroom("data/epa.data.csv")

test <- epa %>%
  pivot_longer(19:51, names_to = "year", values_to = "value")

test$year <- as.numeric(gsub("Y", "", test$year))

test <- test %>%
  group_by(econ_sector, geo_ref, year) %>%
  summarise(co2_value = sum(value))

test <- test %>%
  filter(!econ_sector == "U.S. Territories") %>%
  filter(!geo_ref %in% c("National", "DC"))

test <- left_join(pop, test, by = c("state" = "geo_ref", "year")) 

test <- test %>%
  filter(year > 1997) %>%
  mutate(co2.per.1k = co2_value / pop.in.th)

test <- test %>%
  dplyr::select(-co2_value) %>%
  pivot_wider(names_from = econ_sector, values_from = co2.per.1k)
# units in Tg? teragrams of CO2-equivalent


test$net.change.sinks <- c(NA, diff(test$`LULUCF Sector Net Total`))
  
test <- test %>%
  dplyr::select(-pop.in.th, -`LULUCF Sector Net Total`, `NA`)

test <- test %>%
  dplyr::select(-'NA')

colnames(test)[4:10] <- paste0(colnames(test)[4:10], ".tg.per.1k")

#join
state_data <- left_join(state_data, test, by = c("state" = "state", "year"))

#Rhodium investment data 

# Actual investments by state, technology, and quarter (2023 million USD)

rho <- vroom("data/rhodium.investment.data.csv")

df_annual <- rho %>%
  mutate(year = sub("-Q[1-4]", "", quarter)) %>% # Extract the year
  group_by(year, State) %>%
  summarize(annual.IRA.inv.mn.2023 = sum(Estimated_Actual_Quarterly_Expenditure, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(year = as.numeric(year)) %>%
  left_join(pop, by = c("State" = "state", "year")) %>%
  mutate(IRA.inv.2023.th.pc = annual.IRA.inv.mn.2023 / pop.in.th) 

state_data <- left_join(state_data, df_annual, by = c("state" = "State", "year"))

# Carbon intensity

carbon.intensity <- vroom("data/carbon.intensity.csv")

carbon.intensity <- carbon.intensity %>%
  pivot_longer(2:27, names_to = "year", values_to = "mt.co2.per.mn.2017.gdp") %>%
  mutate( year = as.numeric(year)) %>%
  mutate(statename = tolower(State))

fips <- vroom("data/FIPS_CODES.csv")

carbon.intensity <- carbon.intensity %>%
  left_join(fips, by = "statename") %>% 
  rename(state = stateabbr) %>%
  dplyr::select(state, year, mt.co2.per.mn.2017.gdp) %>%
  distinct(state, year, .keep_all = T)
  
state_data <- left_join(state_data, carbon.intensity, by = c("state", "year"))


# metric tons of energy-related carbon dioxide per chained 2017 million dollars of GDP

ff.gdp <- vroom("data/industry.gdp.data.csv") %>%
  dplyr::select(state, year, ff.gdp.perc)
 
state_data <- left_join(state_data, ff.gdp, by = c("state", "year"))

# Domestic material consumption - downscaled from natioanl values based on 
# state-level population, gdp, and heavy industry shares - using 0.2, 0.3, and 0.5 weights

dmc <- vroom("data/high.industry.weight.dmc.csv")
state_data <- left_join(state_data, dmc, by = c("state", "year"))

#th.mt.mat.cons.per.mn.2017.gdp

# Renewables in electricty production

ec <- vroom("data/elec.perc.renew.hydro.csv")
ec$statename <- tolower(ec$state)
ec <- left_join(ec, fips, by = "statename")
ec <- ec %>%
  dplyr::select(stateabbr, year, elec.perc.renew.hydro.nuclear) %>%
  rename(state = stateabbr) %>%
  distinct(state, year, .keep_all = T)

# Percent low-carbon sources in electricity GENERATION - nuclear, hydro, and renewables 

state_data <- left_join(state_data, ec, by = c("state", "year"))

# Personal spending on goods as percentage of personal expenditure
exp <- vroom("data/expenditures.perc.gdp.csv")

state_data <- left_join(state_data, exp, by = c("state", "year"))

#Share of personal spending on physical goods (as opposed to services)
exp_2 <- vroom("data/goods.share.in.consumption.csv")
state_data <- left_join(state_data, exp_2, by = c("state", "year"))

#get data from EIA 
ev <- read.csv("data/ev_share.csv")
colnames(ev) <- gsub("^X", "", colnames(ev))

ev <- ev %>%
  pivot_longer(2:9, names_to = "year", values_to = "ev.share.perc")

ev$year <- as.numeric(ev$year)

state_data <- left_join(state_data, ev, by = c("state" = "State", "year"))

```

*Clean up*
```{r}
model_data <- state_data %>%
  filter(year >= 2000 & year < 2023) %>%
  filter(!state %in% c("US", "DC")) %>%
  dplyr::select(state, year, 
                perc.renew.energy.prod.bn.btu, 
                perc.renew.energy.cons,
                pats.per.1k, 
                ce.firms.cr.per.1k,
                energy.cons.per.2017.usd.gdp, 
                Agriculture.tg.per.1k, 
                Commercial.tg.per.1k, 
                `Electric Power Industry.tg.per.1k`, 
                Industry.tg.per.1k, 
                Residential.tg.per.1k, 
                Transportation.tg.per.1k, 
                net.change.sinks.tg.per.1k, 
                IRA.inv.2023.th.pc, 
                car.use.pc, 
                plugins.per,
                ev.share.perc, 
                mt.co2.per.mn.2017.gdp,
                ff.gdp.perc, 
                th.mt.mat.cons.per.mn.2017.gdp, 
                elec.perc.renew.hydro.nuclear, 
                goods.spending.as.perc.mn.2017.gdp)

```

*Impute NAs*

```{r}
# NAs in the patent data refer to zero counts 
model_data$pats.per.1k[is.na(model_data$pats.per.1k)] <- 0

# The Rhodium data begins in 2018. I set the value for prior periods to zero. 
# Data on the EV share begins in 2016. I set the EV share prior to that to zero 
# Same for plug-in vehicles
model_data <- model_data %>%
  mutate(IRA.inv.2023.th.pc = ifelse(year < 2018, 0, IRA.inv.2023.th.pc)) %>%
  mutate(ev.share.perc = ifelse(year < 2016, 0, ev.share.perc)) %>%
  mutate(plugins.per = ifelse(year < 2016, 0, plugins.per))


# In all other cases, I Impute NAs based on last known value

model_data <- model_data %>%
  arrange(state, year) %>% 
  group_by(state) %>%      
  mutate(across(
    c(3:22),
    ~ na.locf(na.locf(.x, na.rm = FALSE), fromLast = TRUE)  
  )) %>%
  ungroup()

```

*Scale all variables*
```{r}

model_data[, 3:ncol(model_data)] <- scale(model_data[, 3:ncol(model_data)])

```

*Select variables for model* 
```{r}
model_data <- model_data %>%
  dplyr::select(state, year, perc.renew.energy.prod.bn.btu, 
                perc.renew.energy.cons, pats.per.1k, 
                ce.firms.cr.per.1k,energy.cons.per.2017.usd.gdp, 
                mt.co2.per.mn.2017.gdp, ff.gdp.perc,plugins.per,
                th.mt.mat.cons.per.mn.2017.gdp, car.use.pc,
                elec.perc.renew.hydro.nuclear, plugins.per, Agriculture.tg.per.1k, 
                Commercial.tg.per.1k, `Electric Power Industry.tg.per.1k`, 
                Industry.tg.per.1k, Residential.tg.per.1k, 
                Transportation.tg.per.1k, net.change.sinks.tg.per.1k,
                ev.share.perc, IRA.inv.2023.th.pc,
                goods.spending.as.perc.mn.2017.gdp) 

numeric_data <- as.matrix(model_data[3:23])

```



*Run BFA Model*

```{r}

fit <- MCMCfactanal(
  x = numeric_data,
  factors = 1,                     
  lambda.constraints = list(       # Only constraining two variables according to theoretical expectations 
    perc.renew.energy.cons = list(1, "+"),
    mt.co2.per.mn.2017.gdp = list(1, "-")
  ),
  burnin = 1000,                  
  mcmc = 15000,                    # 15,000 iterations
  thin = 1,                        
  store.scores = TRUE,             
  verbose = 0                     
)

```

*Extract the factor scores from the fitted model* 

```{r}
library(stringr)
library(tibble)

tidy_mcmcfa <- function(fit, z = 1.96) {

  stats <- data.frame(summary(fit)[[1]]) %>%
    rownames_to_column("term")
  

  scores <- stats %>%
    filter(str_detect(term, "^phi[._]")) %>%
    mutate(
      unit   = str_remove(term, "^phi[._]"),
      cilow  = Mean - z * Naive.SE,
      cihigh = Mean + z * Naive.SE
    ) %>%
    relocate(unit, .before = term) %>%
    dplyr::select(-term)


  loadings <- stats %>%
    filter(!str_detect(term, "^phi[._]")) %>%
    mutate(
      parameter = if_else(str_starts(term, "Lambda"), "discrimination", "difficulty"),
      variable  = term %>%
        str_remove("^Lambda") %>%
        str_remove("^Psi")    %>%
        str_remove("_1$"),
      cilow  = Mean - z * Naive.SE,
      cihigh = Mean + z * Naive.SE
    ) %>%
    dplyr::select(variable, parameter, Mean, Naive.SE, cilow, cihigh, term, SD)

  list(scores = scores, loadings = loadings)
}


res <- tidy_mcmcfa(fit)
scores  <- res$scores
summary <- res$loadings

```


*Put results into dataframe*

```{r}
state_year_results <- cbind(
  model_data[, c("state", "year")],  
  Latent_Factor = scores$Mean        
)

state_year_results <- state_year_results %>%
  distinct(state, year, .keep_all = T)

test <- left_join(state_data, state_year_results, by = c("state" = "state", "year"))

```

*Check Discrim Parameters* 

```{r}
ggplot(summary[summary$parameter == "discrimination",], 
       aes(x = reorder(variable, Mean), y = Mean)) +
  geom_point() +
  geom_errorbar(aes(ymin = (Mean - 1.96 * SD), 
                    ymax = (Mean + 1.96 * SD)), width = 0) +
  xlab("") +
  ylab("Discrimination Parameter") +
  coord_flip() +
  geom_hline(yintercept = 0, linetype = 2, alpha = 0.7) +
  theme_clean()
```


*Visualize discrims*

```{r}
ggplot(summary[summary$parameter == "discrimination",], 
       aes(x = reorder(variable, Mean), y = Mean)) + 
  geom_point() +
  geom_errorbar(aes(ymin = (Mean - 1.96 * SD), 
                    ymax = (Mean + 1.96 * SD)), width = 0) +
  xlab("") +
  ylab("Discrimination Parameter") +
  coord_flip() +
  geom_hline(yintercept = 0, linetype = 2, alpha = 0.4) +
  theme_clean() +
  scale_x_discrete(labels = c("CO2e Emissions / GDP", 
                              "Energy Consumption / GDP", 
                              "Industry CO2e PC", 
                              "Transportation Emissions PC", 
                              "Electric Power Industry CO2e PC",
                              "Fossil Fuels / GDP",
                              "Car Use Intensity", 
                              "Total Domestic Material Consumption / GDP",
                              "Commercial CO2e PC",
                              "Agriculture CO2e PC",
                              "Goods Share in Personal Spending",
                              "Net Land Use Change CO2e PC",
                              "Residential CO2e PC", 
                              "Green Industry Investments PC", 
                              "Green Industry Firms PC", 
                              "% Renewables of Energy Consumption",
                              "% Electric Vehicles",
                              "Hybrid Vehicles PC",
                              "% Renewables of Energy Production", 
                              "Green Patents PC",
                              "Clean Energy Share in Electricity"
                              
                              )) # Custom labels


ggsave("discrims.final.png", 
       width = 8, height = 8/1.618, units = "in")
```


```{r}
state_year_results <- cbind(
  model_data[, c("state", "year")],  # Select state and year columns
  Latent_Factor = scores$Mean           # Add latent factor scores
)

state_year_results <- state_year_results %>%
  distinct(state, year, .keep_all = T)


test <- left_join(state_data, state_year_results, by = c("state" = "state", "year"))

write.csv(test, "final.data.csv")
```



**ROBUSTNESS CHECKS**

*Check impact of single indicators on overall index* 

*Iteratively remove one variable and check correlation with base model*


```{r}
all_vars <- colnames(numeric_data)


constrained_vars <- c("perc.renew.energy.cons", "mt.co2.per.mn.2017.gdp")

```

```{r}
robustness_scores <- list()

for (v in all_vars) {
  cat("Dropping:", v, "\n")
  
  data_subset <- numeric_data[, !(colnames(numeric_data) == v)]
  
  remaining_constraints <- constrained_vars[constrained_vars != v]
  
  if (length(remaining_constraints) == 0) {
    fallback_var <- colnames(data_subset)[1]
    lambda_constraints <- list()
    lambda_constraints[[fallback_var]] <- list(1, "+")
  } else {
    lambda_constraints <- list()
    for (var in remaining_constraints) {
      direction <- ifelse(var == "perc.renew.energy.cons", "+", "-")
      lambda_constraints[[var]] <- list(1, direction)
    }
  }
  
  fit_lo <- MCMCfactanal(
    x = data_subset,
    factors = 1,
    lambda.constraints = lambda_constraints,
    burnin = 1000,
    mcmc = 15000,
    thin = 1,
    store.scores = TRUE,
    verbose = 0
  )
  
  lo_summary <- summary(fit_lo)$statistics
  lo_scores <- lo_summary[grepl("phi.", rownames(lo_summary)), "Mean"]
  
  robustness_scores[[v]] <- lo_scores
}

```

```{r}
# Original factor scores
original_scores <- scores$Mean

# Correlation between full and leave-one-out
cor_results <- sapply(robustness_scores, function(scores_lo) {
  cor(original_scores, scores_lo, use = "complete.obs")
})

print(round(cor_results, 3))

```



*Robustness Check* 

**Create Core index based on minimal inputs**

*Select variables for model* 

```{r}

small_data <- model_data %>%
  dplyr::select(state, year, 
                perc.renew.energy.prod.bn.btu, 
                perc.renew.energy.cons, 
                energy.cons.per.2017.usd.gdp, 
                mt.co2.per.mn.2017.gdp, 
                ff.gdp.perc,
                th.mt.mat.cons.per.mn.2017.gdp, 
                                ) 

numeric_data <- as.matrix(small_data[3:8])
```

*Run BFA Model*

```{r}

fit <- MCMCfactanal(
  x = numeric_data,
  factors = 1,                     
  lambda.constraints = list(       
    perc.renew.energy.cons = list(1, "+"),
    mt.co2.per.mn.2017.gdp = list(1, "-")
  ),
  burnin = 1000,                   
  mcmc = 15000,                     
  thin = 1,                        
  store.scores = TRUE,             
  verbose = 0                     
)

res <- tidy_mcmcfa(fit)
scores  <- res$scores
summary <- res$loadings

```


*Extract results*

```{r}
state_year_results <- cbind(
  model_data[, c("state", "year")],  
  Core_index = scores$Mean        
)

state_year_results <- state_year_results %>%
  distinct(state, year, .keep_all = T)

test <- left_join(state_data, state_year_results, by = c("state" = "state", "year"))

```

```{r}

test <- test %>%
  dplyr::select(state, year, Core_index)

write.csv(test, "core.index.csv")
```




